STM Exploration

Topic Summary

First, let’s load our STM results from Part 1.

load("~/Dropbox (UNC Charlotte)/NCStateSenateFacebook/data/stmFit.RData")
load("~/Dropbox (UNC Charlotte)/NCStateSenateFacebook/data/out.RData")

Let’s explore the size of the topics by their topic proportions.

library(stm)
plot(stmFit, type = "summary", xlim = c(0,.14), n = 5, #labeltype = "frex",
         main = "NC State Senators' Topics on Facebook", text.cex = 0.8)

I’ve assigned labels to the topics based on my interpretation of the word-topic probabilities (see next section).

topicNames <- labelTopics(stmFit)
k <- 40
topic <- data.frame(
  topicnames = c("Press Conference",
                 "Local Government",
                 "Religious Freedom",
                 "#WeAreNotThis",
                 "Positive Outlook",
                 "Pat McCrory",
                 "Legislation",
                 "Gerrymandering",
                 "Supreme Court",
                 "Voter ID/Fraud",

                 "Presidential Election",
                 "Campaign Support",
                 "Health Care",
                 "Congressional Elections",
                 "North Carolina",
                 "HB2",
                 "Constituent Newsletter",
                 "Sen Van Duyn Posts",
                 "God, Family, Freedom",
                 "Teacher Pay",

                 "#NCPOL and #NCGA",
                 "Hurricane Matthew",
                 "Redistricting",
                 "Taxes",
                 "Congratulations",
                 "Bathroom Safety",
                 "Economy/Jobs",
                 "Student/Women's Issues",
                 "Hillary Clinton",
                 "Church",

                 "Civil Rights",
                 "Energy Tax Credits",
                 "Roy Cooper",
                 "Get Out the Vote",
                 "Gun Violence",
                 "Berger Press Releases",
                 "Public Assistance",
                 "Town Hall Events",
                 "Holiday Wishes",
                 "Conservative Values"),
  TopicNumber = 1:k,
  TopicProportions = colMeans(stmFit$theta))

Interpretating the Topics

Let’s explore the words with the highest (raw) probabilities and the highest FREX (i.e., words that are most frequent and exclusive to the topic). These plots aid in the interpretation of the labels I created in the previous step (and used in the title of each plot).

par(mfrow = c(4,2),mar = c(1, 1, 2, 1))
for (i in 1:k){
  plot(stmFit, type = "labels", n = 15, 
           topics = i, main = paste0(topic$topicnames[i]," - Raw Probabilities"),
           width = 55)
  plot(stmFit, type = "labels", n = 15, 
           topics = i, main = paste0(topic$topicnames[i]," - FREX"), 
           labeltype = "frex", width = 55)
}

#shortdoc <- substr(subset(fbData$ID,out$meta$ID),1,200))
#thoughts3 <- findThoughts(stmFit, texts = shortdoc, n = 2, topics = 2)

We can also plot the exclusivity and semantic coherence, which are two metrics that measure the “interpretability” of each topic. Higher semantic coherence indicate topics that have more consistent words (more interpretable) while exclusivity measures how exclusive the words are to the topic relative to other topics (e.g. low values mean topics that are vague and share a lot of words with other topics while high values indicate words that are very unique/exclusive to the topic).

par(mfrow = c(1,1),mar = c(2, 2, 2, 2))
topicQuality(stmFit,
             documents = out$documents, 
             main = "Topic Interpretability: Exclusivity and Semantic Coherence")
##  [1] -117.69762  -99.02861 -108.36939 -113.10056 -119.04781  -83.70658
##  [7] -114.95785 -106.45754  -94.32849  -83.56333  -60.37099 -107.18429
## [13]  -85.71741  -96.02803  -97.72806  -96.74052 -133.55053 -109.95572
## [19] -131.66939  -68.16150 -170.58757 -131.58723  -89.55902  -85.80290
## [25] -137.62628  -63.21635  -80.50761 -141.80216 -144.41775 -155.46163
## [31] -106.00173 -145.81977  -82.39580  -84.88564 -156.92077  -49.98792
## [37] -141.26560 -126.25005 -165.69881 -129.67554
##  [1] 9.835140 9.854379 9.658130 9.788529 9.958288 9.982604 9.874494
##  [8] 9.775709 9.805462 9.818819 9.903764 9.816718 9.858738 9.807711
## [15] 9.940360 9.678566 9.849708 9.946512 9.778923 9.777436 9.881158
## [22] 9.791882 9.689455 9.664901 9.883178 9.892371 9.693516 9.778924
## [29] 9.574346 9.878137 9.909861 9.706640 9.881069 9.860808 9.713992
## [36] 9.889008 9.734380 9.794439 9.742642 9.959924

Estimating the Effect of Covariates

One of the main contributions of STM is to provide a mechanism to test (with statistical significance) the impact document-level covariates (e.g. party, time of post) have on the topic proportions (prevalence).

prep <- estimateEffect(1:k ~ Party + s(Time), stmFit, meta = out$meta, uncertainty = "Global")

Result <- plot(prep, "Party", method = "difference", 
                              cov.value1 = "Democratic", cov.value2 = "Republican", 
                              verbose.labels = F, 
                              ylab = "Expected Difference in Topic Probability by Party \n (with 95% Confidence Intervals)", 
                              xlab = "More Likely Republican                         Not Significant                          More Likely Democratic",
                              main = "Effect of Party on Topic Prevelance for Facebook Posts of NC State Senators",
                              xlim = c(-0.08,0.08))

# order based on Expected Topic Proportion
rank = order(unlist(Result$means))
topic <- topic[rank,]

par(mfrow = c(1,1),mar = c(6, 6, 4, 4))
STMresults <- plot(prep, "Party", method = "difference", cov.value1 = "Democratic", 
                                  cov.value2 = "Republican", 
                                  topics = topic$TopicNumber,
                                  verbose.labels = F, 
                                  ylab = "Expected Difference in Topic Probability by Party \n (with 95% Confidence Intervals)", 
                                  labeltype = "custom",
                                  xlab = "More Likely Republican                         Not Significant                          More Likely Democratic",
                                  custom.labels  = topic$topicnames, 
                                  main = "Effect of Party on Topic Prevelance for Facebook Posts of NC State Senators",
                                  xlim = c(-.08,0.08))

We can also explore the effect of a contiuous covariate like time. In this case, we use month using a b-spline to smooth out our results.

For simplicity, I removed confidence intervals and only plotted the (point) estimates.

# time
par(mfrow = c(2,2),mar = c(4,4,2,2))
for (i in 1:k){
  plot(prep, "Time", method = "continuous", topics = rank[i], rank[i], 
                      main = paste0(topic$topicnames[i],": Topic ",rank[i]),
                      printlegend = FALSE, ylab = "Exp. Topic Prob", 
                      xlab = "Time (Month, 1 = Jan 2015 to 24 = Dec 2016)", ylim = c(-0.02,0.2),
                      moderator = "Party", moderator.value = "Democratic",  linecol = "blue",
                      ci.level = 0
                      )
  plot(prep, "Time", method = "continuous", topics = rank[i], rank[i], 
                      main = paste0(topic$topicnames[i],": Topic ",rank[i]),
                      printlegend = FALSE, ylab = "Exp. Topic Prob", 
                      xlab = "Time (Month, 1 = Jan 2015 to 24 = Dec 2016)", ylim = c(-0.02,0.2),
                      moderator = "Party", moderator.value = "Republican",  linecol = "red", add = "T", 
                      ci.level = 0
  )
legend(-1, 1.9, c("Democratic", "Republican"), lwd = 2, col = c("blue", "red"))
}

Comparing Topics

We can also visualize ways to compare topics with the “perspectives” graphs.

Comparing Presidential Election

# Pres Election and Hillary Clinton
plot(stmFit, type = "perspectives", topics = c(11,29), n=30, plabels = c("Presidential Election","Hillary Clinton"))

Topic Correlations

Create Networks

Let’s create an interactive network for the topics (nodes) in which an edge represents a significant correlation between the topic. The size of the node is the prevalence (expected topic proportion) for the topic.

library(igraph); library(visNetwork)
par(mfrow = c(1,1))

mod.out.corr <- topicCorr(stmFit, cutoff = .025)

#library(corrplot)
#corrplot(mod.out.corr$cor, order="hclust", hclust.method="ward.D2", method = "circle", type = "lower", diag = F)

#mod.out.corr <- topicCorr(stmFit, method = "huge")

links2 <- as.matrix(mod.out.corr$posadj)
net2 <- graph_from_adjacency_matrix(links2, mode = "undirected")
table(V(net2)$type)
## < table of extent 0 >
net2 <- simplify(net2, remove.multiple = F, remove.loops = T) 

links <- as_data_frame(net2, what="edges")
nodes <- as_data_frame(net2, what="vertices")

# Community Detection
clp <- cluster_label_prop(net2)
nodes$community <- clp$membership

means <- as.data.frame(unlist(STMresults$means))
colnames(means) <- "means"
color <- colorRamp(c("white","blue"))(abs(means$means)/0.05)
means$colorDem <- rgb(color[,1],color[,2],color[,3],  maxColorValue=255)

color <- colorRamp(c("white","red"))(abs(means$means)/0.05)
means$colorRep <- rgb(color[,1],color[,2],color[,3],  maxColorValue=255)

means$color <- ifelse(means$means>0,means$colorDem,means$colorRep)

#visNetwork edits
nodes$shape <- "dot"  
nodes$shadow <- TRUE # Nodes will drop shadow
nodes$title <- apply(topicNames$prob, 1, function(x) paste0(x, collapse = " + "))[rank] # Text on click
nodes$label <- topic$topicnames # Node label
nodes$size <- (topic$TopicProportions / max(topic$TopicProportions)) * 40 # Node size
nodes$borderWidth <- 2 # Node border width

nodes$color.background <- means$color
nodes$color.border <- "black"
nodes$color.highlight.background <- "orange"
nodes$color.highlight.border <- "darkred"
nodes$id <- topic$TopicNumber

Network plot

visNetwork(nodes, links, width="100%",  height="600px", main="NC State Senator Topic (Correlation) Network") %>% visOptions(highlightNearest = list(enabled = TRUE, algorithm = "hierarchical")) %>% 
visInteraction(navigationButtons = TRUE)